Copyright (C) 2019-2021 Martin Engqvist
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.
import sys
import os
from dotenv import load_dotenv, find_dotenv # do 'pip install python-dotenv'
from os.path import join, dirname, basename, exists, isdir
### Load environmental variables from the project root directory ###
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()
# load up the entries as environment variables
load_dotenv(dotenv_path)
# now you can get the variables using their names
# Check whether a network drive has been specified
DATABASE = os.environ.get("NETWORK_URL")
if DATABASE == 'None':
pass
else:
pass
#mount network drive here
# set up directory paths
CURRENT_DIR = os.getcwd()
PROJ = dirname(dotenv_path) # project root directory
DATA = join(PROJ, 'data') #data directory
RAW_EXTERNAL = join(DATA, 'raw_external') # external data raw directory
RAW_INTERNAL = join(DATA, 'raw_internal') # internal data raw directory
INTERMEDIATE = join(DATA, 'intermediate') # intermediate data directory
FINAL = join(DATA, 'final') # final data directory
FIGURES = join(PROJ, 'figures') # figure output directory
#from orgtools import topfunctions, org_tax, uid_tax
from ete3 import Tree, NodeStyle, TextFace, TreeStyle
import ete3
import pandas as pd
import matplotlib
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import scipy
import scipy.cluster.hierarchy as sch
font = {'family' : 'DejaVu Sans',
'weight' : 'normal',
'size' : 8}
matplotlib.rc('font', **font)
print('Python version: %s' % (sys.version))
print('Pandas version: %s' % pd.__version__)
print('Matplotlib version: %s' % matplotlib.__version__)
print('ETE3 version: %s' % ete3.__version__)
df = pd.read_csv(join(FINAL, '332_yeast_genomes_enzyme_info_version_3.tsv'), sep='\t')
display(df.dtypes)
display(df.head())
display(df.tail())
display(df.describe())
display(df.groupby(['enzyme_type'])['gene'].count())
df.groupby(['enzyme_type'])['gene'].count().plot(kind='bar')
plt.ylabel('Total identified (#)')
Paper doi: 10.1016/j.cell.2018.10.023
Data doi: 10.6084/m9.figshare.5854692
Figshare link: https://figshare.com/articles/Tempo_and_mode_of_genome_evolution_in_the_budding_yeast_subphylum/5854692
def norm_organism(orgname):
'''
Return abbreviated organism name
'''
orgname = orgname.replace('[', '').replace(']', '')
# two organisms occur twice, deal with these special cases
if orgname == 'Metschnikowia matae var. maris':
return 'metschnikowia_matae_maris'
elif orgname == 'Nadsonia fulvescens var. elongata':
return 'nadsonia_fulvescens_var_elongata'
# now parse filename for the others
if orgname.startswith('yH'):
organism = '_'.join(orgname.split()[1:3]).lower().replace('.', '')
else:
organism = '_'.join(orgname.split()[:2]).lower().replace('.', '')
return organism
def translate_organism(orgname):
'''
The organism names are different between
my data file and that used in the phylogenetic tree.
Translate my names to the ones used in the tree.
'''
dictionary = {'nadsonia_fulvescens':'nadsonia_fulvescens_var_fulvescens',
'magnusiomyces_tetrasperma':'magnusiomyces_tetraspermus',
'candida_versatilis':'wickerhamiella_versatilis',
'candida_infanticola':'wickerhamiella_infanticola',
'candida_salmanticensis':'groenewaldozyma_salmanticensis',
'hanseniaspora_vinae':'hanseniaspora_vineae',
'candida_apicola':'starmerella_apicola',
'arxula_adeninivorans':'blastobotrys_adeninivorans',
'blastobotrys_raffinofermentans':'blastobotrys_raffinosifermentans',
'candida_homilentoma':'hyphopichia_homilentoma',
'candida_azyma':'wickerhamomyces_sp_yb_2243',
'candida_tenuis':'yamadazyma_tenuis',
'nakaseomyces_castellii':'candida_castellii',
'ogataea_populiabae':'ogataea_populialbae',
'nakaseomyces_bracarensis':'candida_bracarensis',
'candida_pyralidae':'suhomyces_pyralidae',
'candida_gatunensis':'teunomyces_gatunensis',
'dekkera_bruxellensis':'brettanomyces_bruxellensis',
'nakaseomyces_nivariensis':'candida_nivariensis',
'candida_tanzawaensis':'suhomyces_tanzawaensis',
'candida_kruisii':'teunomyces_kruisii',
'candida_cretensis':'teunomyces_cretensis',
'metschnikowia_bicuspidata':'metschnikowia_bicuspidata_var_bicuspidata',
'lachancea_fantastica':'lachancea_fantastica_nom_nud',
'candida_emberorum':'suhomyces_emberorum',
'candida_canberraensis':'suhomyces_canberraensis',
'ogataea_philodendra':'ogataea_philodendri',
'metschnikowia_dekortum':'metschnikowia_dekortorum',
'metschnikowia_lockheadii':'metschnikowia_lochheadii',
'metschnikowia_matae_maris':'metschnikowia_matae_var_maris',
'metschnikowia_matae':'metschnikowia_matae_var_matae'}
return dictionary.get(orgname)
def load_tree():
filepath = join(RAW_EXTERNAL, 'phylo_tree', '332_2408OGs_time-calibrated_phylogeny_species-names_updated.newick')
tree = Tree(filepath)
ts = TreeStyle()
ts.mode = "c"
ts.scale = 175
ts.show_scale = False
return tree, ts
tree, ts = load_tree()
# save tree with the updated names
#tree.write(format=1, outfile=join('phylo_tree', '332_genomes_my_names.newick'))
tree.render("%%inline", dpi=600, w=2400, tree_style=ts)
def color_tree(tree, data, cmap):
# now add in keys for the alternate org names
for name in sorted(data.keys()):
alt_name = translate_organism(name)
if alt_name is not None:
data[alt_name] = data[name]
orgs_with_data = data.keys()
# setup a way to normalize values to the colormap
norm = matplotlib.colors.Normalize(vmin=min(data.values()), vmax=max(data.values()))
# go through and color the nodes according to how many enzymes they have
for node in tree.traverse():
# Hide node circles
#node.img_style['size'] = 0
if node.is_leaf():
name = norm_organism(node.name)
if not name in orgs_with_data:
raise ValueError
rgba = cmap(norm(data[name]))
color = matplotlib.colors.to_hex(rgba, keep_alpha=False)
# change the color of the leaf
nstyle = NodeStyle()
# nstyle["bgcolor"] = color
# nstyle["size"] = 0
nstyle["fgcolor"] = color
nstyle["size"] = 15
# add teh color
node.set_style(nstyle)
# make the name pretty
node.name = name.replace('_', ' ').capitalize()
else:
# change the color of the node
nstyle = NodeStyle()
nstyle["size"] = 0
# apply
node.set_style(nstyle)
return tree
tree, ts = load_tree()
# count the families per organism (with excluding GT)
data = df[df.enzyme_type != 'GT'].groupby('organism')['family'].count().reset_index()
# convert data to a dictionary with organism keys and data values
data = data.set_index('organism')['family'].to_dict()
# setup colormap
cmap = matplotlib.cm.get_cmap('YlOrRd')
# add color to the tree
tree = color_tree(tree, data, cmap)
# define output filename
filename = '332_tree'
# save the colormap (scale) for this plot
mat = np.array([[min(data.values())], [max(data.values())]])
img = plt.imshow(mat, origin="lower", cmap=cmap, interpolation='nearest')
plt.xticks([])
plt.yticks([])
cbar = plt.colorbar()
cbar.set_label('Enzyme families (#)', rotation=270, labelpad=15, fontsize=10)
plt.savefig(join(FIGURES, '{}_colorbar.png'.format(filename)))
plt.savefig(join(FIGURES, '{}_colorbar.pdf'.format(filename)))
# save pdf
tree.render(join(FIGURES, "{}.pdf".format(filename)), w=2400, tree_style=ts)
# save png
tree.render(join(FIGURES, "{}.png".format(filename)), w=2400, dpi=600, tree_style=ts)
# plot it
tree.render("%%inline", w=2400, dpi=100, tree_style=ts)
# first I need to group the families according to the substrate they use
family_df = pd.read_csv(join(FINAL, '332_yeast_genomes_heatmap_data.tsv'), sep='\t')
# create a new data frame and set the organism name as index
substrate_df = pd.DataFrame(family_df.organism.values)
substrate_df.columns = ['Name']
# define substrate families
substrates = {'xylan':['GH3','GH5','GH11','GH30','GH43','GH51','GH62','GH67','GH115','CE1','CE3','CE4','CE5','CE12','CE15','CE16','CBM13'],
'mannan':['GH2','GH5','GH26','GH27','GH76','GH125','GH134'],
'xyloglucan':['GH2','GH3','GH5','GH12','GH16','GH29','GH31','GH35','GH42','GH45','GH74','GH95'],
'beta-glucan':['GH3','GH16','GH55'],
'lignin':['AA1','AA2','CE1','CE15'],
'cellulose':['AA3','AA9','GH1','GH3','GH5','GH7','GH8','GH12','GH26','GH45','GH51','GH74','CBM1'],
'chitin':['GH7','GH8','GH16','GH18','GH20','GH75','AA11','CE4','CBM18','CBM19','CBM32','CBM50'],
'pectin':['PL1','PL3','PL4','PL26','GH28','GH78','CE8','GH106','GH139'],
'starch':['GH13','GH15','GH31','GH63','CBM21'],}
#GH46???
# add in substrates
for sub in sorted(substrates.keys()):
substrate_df[sub] = family_df[substrates[sub]].sum(axis=1)
#### Try to make organism names in substrate data match those in the tree #####
# go through and change the names to the alternate ones
for node in tree.traverse():
if node.is_leaf():
tree_name = node.name
tree_norm_name = norm_organism(tree_name)
for subst_name in substrate_df.Name:
subst_alt_name = translate_organism(subst_name)
if subst_alt_name is not None:
search_name = subst_alt_name
else:
search_name = subst_name
if search_name == tree_norm_name:
# print(subst_name, tree_name)
substrate_df.loc[substrate_df.Name == subst_name, 'Name'] = tree_name
# change the index column
substrate_df = substrate_df.set_index('Name')
display(substrate_df.head())
display(substrate_df.tail())
display(substrate_df.describe())
# save data
substrate_df.to_csv(join(FINAL, '332_yeast_genomes_heatmap_data_substrates_tree_names.tsv'), sep='\t', index_label='#Names')
# get the organism names and substrate names
y_labels = [s.replace('_', ' ').capitalize() for s in substrate_df.index]
x_labels = [s.capitalize() for s in substrate_df.columns]
# setup the heatmap
fig, ax = plt.subplots(1,1, figsize=(0.25*len(x_labels), 0.25*len(y_labels)))
img = ax.imshow(substrate_df.to_numpy(), cmap='YlOrRd')
# add names
ax.xaxis.tick_top()
ax.set_xticks(np.arange(0, len(x_labels)))
ax.set_yticks(np.arange(0, len(y_labels)))
ax.set_xticklabels(x_labels, rotation=90)
ax.set_yticklabels(y_labels)
# add the colorbar
cbar = fig.colorbar(img, shrink=0.35)
cbar.set_label('Substrate families (#)', rotation=270, labelpad=15, fontsize=10)
filename = 'all_orgs_substrate_families'
plt.savefig(join(FIGURES, '{}.png'.format(filename)))
plt.savefig(join(FIGURES, '{}.pdf'.format(filename)))
# chosen_orgs = ['aciculoconidium_aculeatum',
# 'ambrosiozyma_ambrosiae',
# 'ambrosiozyma_monospora',
# 'ambrosiozyma_oregonensis',
# 'ambrosiozyma_philentoma',
# 'ascoidea_asiatica',
# 'ascoidea_rubescens',
# 'arxula_adeninivorans', # actually 'blastobotrys_adeninivorans'
# 'blastobotrys_americana',
# 'blastobotrys_mokoenaii',
# 'blastobotrys_muscicola',
# 'blastobotrys_nivea',
# 'blastobotrys_peoriensis',
# 'blastobotrys_proliferans',
# 'blastobotrys_raffinofermentans',
# 'blastobotrys_serpentis',
# 'candida_gorgasii',
# 'debaryomyces_fabryi',
# 'debaryomyces_maramus',
# 'debaryomyces_nepalensis',
# 'debaryomyces_prosopidis',
# 'debaryomyces_subglobosus',
# 'diddensiella_caesifluorescens',
# 'lipomyces_arxii',
# 'lipomyces_doorenjongii',
# 'lipomyces_kononenkoae',
# 'lipomyces_lipofer',
# 'lipomyces_starkeyi',
# 'saccharomycopsis_capsularis',
# 'saccharomycopsis_malanga',
# 'scheffersomyces_lignosus',
# 'scheffersomyces_stipitis',
# 'spathaspora_passalidarum',
# 'spencermartinsiella_europaea',
# 'sugiyamaella_lignohabitans',
# 'wickerhamomyces_anomalus',
# 'wickerhamomyces_canadensis',
# 'wickerhamomyces_ciferrii',
# 'wickerhamomyces_hampshirensis']
chosen_orgs = ['lipomyces_starkeyi',
'lipomyces_arxii',
'lipomyces_kononenkoae',
'lipomyces_lipofer',
'lipomyces_doorenjongii',
'spencermartinsiella_europaea',
'sugiyamaella_lignohabitans',
'diddensiella_caesifluorescens',
'blastobotrys_peoriensis',
'blastobotrys_nivea',
'blastobotrys_muscicola',
'blastobotrys_mokoenaii',
'blastobotrys_americana',
'arxula_adeninivorans',
'blastobotrys_proliferans',
'blastobotrys_serpentis',
'blastobotrys_raffinofermentans',
'ambrosiozyma_philentoma',
'ambrosiozyma_ambrosiae',
'ambrosiozyma_monospora',
'ambrosiozyma_oregonensis',
'spathaspora_passalidarum',
'debaryomyces_subglobosus',
'debaryomyces_fabryi',
'debaryomyces_prosopidis',
'debaryomyces_hansenii',
'debaryomyces_nepalensis',
'candida_gorgasii',
'debaryomyces_maramus',
'scheffersomyces_lignosus',
'scheffersomyces_stipitis',
'aciculoconidium_aculeatum',
'wickerhamomyces_anomalus',
'wickerhamomyces_ciferrii',
'wickerhamomyces_canadensis',
'wickerhamomyces_hampshirensis',
'saccharomycopsis_capsularis',
'saccharomycopsis_malanga',
'ascoidea_asiatica',
'ascoidea_rubescens']
# first I need to group the families according to the substrate they use
family_df = pd.read_csv(join(FINAL, '332_yeast_genomes_heatmap_data.tsv'), sep='\t')
# create a new data frame and set the organism name as index
substrate_df = pd.DataFrame(family_df.organism.values)
substrate_df.columns = ['Name']
# add in substrates
for sub in sorted(substrates.keys()):
substrate_df[sub] = family_df[substrates[sub]].sum(axis=1)
# change the index column
substrate_df = substrate_df.set_index('Name')
display(substrate_df.head())
display(substrate_df.tail())
display(substrate_df.describe())
# save data
substrate_df.to_csv(join(FINAL, '332_yeast_genomes_heatmap_data_substrates.tsv'), sep='\t', index_label='#Names')
for org in chosen_orgs:
if org not in substrate_df.index:
[print(org)]
substrate_df_chosen = substrate_df.loc[chosen_orgs]
# get the organism names and substrate names
y_labels = [s.replace('arxula_adeninivorans', 'blastobotrys_adeninivorans').replace('_', ' ').capitalize()
for s in substrate_df_chosen.index]
x_labels = [s.capitalize() for s in substrate_df.columns]
# setup the heatmap
fig, ax = plt.subplots(1,1, figsize=(0.25*len(x_labels), 0.25*len(y_labels)))
img = ax.imshow(substrate_df_chosen.to_numpy(), cmap='YlOrRd')
# add names
ax.xaxis.tick_top()
ax.set_xticks(np.arange(0, len(x_labels)))
ax.set_yticks(np.arange(0, len(y_labels)))
ax.set_xticklabels(x_labels, rotation=90)
ax.set_yticklabels(y_labels)
# add the colorbar
cbar = fig.colorbar(img, shrink=0.35)
cbar.set_label('Substrate families (#)', rotation=270, labelpad=15, fontsize=10)
filename = 'selected_orgs_substrate_families'
plt.savefig(join(FIGURES, '{}.png'.format(filename)), pad_inches=3)
plt.savefig(join(FIGURES, '{}.pdf'.format(filename)), pad_inches=3)